# List of required packages
required_packages <- c(
  "tidyverse",  # Data manipulation and visualization
  "scico"       # Scientific color palettes
)

# Install any missing packages.
# Availability is probed with requireNamespace() rather than by scanning
# installed.packages(): the latter reads metadata for every installed package
# and its help page advises against using it to test for a single package.
new_packages <- required_packages[
  !vapply(required_packages, requireNamespace, logical(1), quietly = TRUE)
]
if (length(new_packages) > 0) {
  install.packages(new_packages)
}

# Load all required packages.
# invisible() keeps the list returned by lapply() from being auto-printed
# at top level.
invisible(lapply(required_packages, library, character.only = TRUE))

# Make sure the output folders exist. The parent folder comes first so the
# two sub-folders can be created inside it; warnings for folders that already
# exist are suppressed.
invisible(lapply(
  c("outputs", "outputs/tables", "outputs/figures"),
  dir.create,
  showWarnings = FALSE
))

# Read the merged survey file, then discard every respondent without a
# recorded party preference. This filter matters: all later counts and
# percentages are computed on the remaining rows only.
data <- read_csv("merged_survey_data.csv")
data <- drop_na(data, party_pref)

# Frequency table of party preference (the "SALF" entry is the one of interest)
data$party_pref |>
  as.factor() |>
  summary()

# Per survey: total respondents, number preferring SALF, and SALF share in %
summarise(
  group_by(data, survey),
  total_respondents = n(),
  salf_count = sum(party_pref == "SALF", na.rm = TRUE),
  salf_percentage = 100 * (salf_count / total_respondents)
)

# Alternative party preference: take party_pref2 where it is available and
# fall back to party_pref where party_pref2 is missing.
# NOTE(review): party_pref2 presumably carries the vote-recall-based
# preference; confirm against the data documentation.
data <- mutate(
  data,
  party_pref_alt = ifelse(is.na(party_pref2), party_pref, party_pref2)
)

# Frequency table of the alternative party preference
data$party_pref_alt |>
  as.factor() |>
  summary()

# Quick structural overview of the full data set
glimpse(data)

# Keep only the variables used in the analysis and give each survey a
# human-readable label. Thematic groupings below follow the annotations in
# the original script.
data_subset <- data |>
  select(
    # Grouping variable
    survey,
    # Socio-demographics and ideology (controls / independent variables)
    age_group, age, gender, edu_fct, lr_norm,
    # Outcomes (dependent variables)
    vote_intention, vote_recall, ptv_salf,
    # Policy attitudes (AndPol items, anti-gender-binary, heteropride)
    policy_proimmigration, policy_antigenderbinary, policy_proredistribution,
    heteropride,
    # Views of democracy: individual items
    democracy_equality, democracy_justice, democracy_freedom,
    democracy_welfare, democracy_diversity, democracy_insecurity,
    democracy_fraud, democracy_inefficacy, democracy_disorder,
    democracy_weakness,
    # Democracy: meanings index and binary support indicator
    dem_index, dem_support_lite,
    # Catalonia-related items
    monarchy_backsliding, media_backsliding, judiciary_backsliding,
    supress_school_expression, suspend_autonomy, forbid_parties,
    # Identity
    identity_comparative, identity_region, identity_spanish,
    # Media use
    info_social_media, tiktok_info, telegram_info,
    # EES items
    against_redistribution, against_restrictive_immigration,
    against_same_sex_marriage, woman_traditional,
    # Media
    fake_news,
    # Democracy
    swd, external_efficacy,
    # International / EU
    pro_ue, ue_views, assistance_ukraine, external_efficacy_ue
  ) |>
  mutate(
    # Map internal survey codes to display labels; codes not listed here are
    # left as they are by recode().
    survey = recode(
      survey,
      andpol       = "Original survey",
      db40_june    = "40db June",
      db40_july    = "40db July",
      db40_august  = "40db August",
      cis_pre      = "CIS EP Pre-electoral",
      cis_pos      = "CIS EP Post-electoral",
      cis_campaign = "CIS EP Campaign",
      cis_bjun     = "CIS June",
      cis_bjul     = "CIS July",
      gesop        = "GESOP",
      ees          = "EES 2024"
    )
  )

# Display order of the variables. Used further down as the factor levels of
# the `variable` column, so it fixes the order in which variables are shown.
variable_order <- c(
  # Socio-demographics and ideology
  "age", "age_group", "gender", "edu_fct", "lr_norm",
  # Outcomes
  "vote_intention", "vote_recall", "ptv_salf",
  # Redistribution and immigration
  "policy_proredistribution", "against_redistribution",
  "policy_proimmigration", "against_restrictive_immigration",
  # Gender and sexuality
  "against_same_sex_marriage", "woman_traditional",
  "policy_antigenderbinary", "heteropride",
  # EU / international
  "pro_ue", "ue_views", "external_efficacy_ue", "assistance_ukraine",
  # Democracy: satisfaction and efficacy
  "swd", "external_efficacy",
  # Democracy: individual items
  "democracy_equality", "democracy_justice", "democracy_freedom",
  "democracy_welfare", "democracy_diversity", "democracy_insecurity",
  "democracy_fraud", "democracy_inefficacy", "democracy_disorder",
  "democracy_weakness",
  # Democracy: support indicator and meanings index
  "dem_support_lite", "dem_index",
  # Catalonia-related items
  "monarchy_backsliding", "media_backsliding", "judiciary_backsliding",
  "supress_school_expression", "suspend_autonomy", "forbid_parties",
  # Identity
  "identity_comparative", "identity_region", "identity_spanish",
  # Media
  "info_social_media", "tiktok_info", "telegram_info", "fake_news"
)

# Percentage of missing values for every variable within each survey,
# reshaped to long format (one row per survey/variable pair).
survey_variables <- data_subset |>
  group_by(survey) |>
  summarise(
    # Share of NA entries in the column, expressed in percent
    across(everything(), \(column) sum(is.na(column)) / n() * 100)
  ) |>
  pivot_longer(
    cols = -survey,
    names_to = "variable",
    values_to = "missing_percentage"
  ) |>
  mutate(
    # Factor levels follow variable_order so variables display in that order
    variable = factor(variable, levels = variable_order)
  )

# Heatmap of the percentage of missing values per variable (x) and survey (y).
# Survey/variable pairs that are 100% missing are filtered out, so no tile is
# drawn for them.
variables_availability_plot <- survey_variables |>
  filter(missing_percentage < 100) |>
  ggplot(aes(x = variable, y = survey, fill = missing_percentage)) +
  geom_tile(color = "white") +
  # scico "berlin" palette, applied in reversed order (direction = -1)
  scale_fill_scico(palette = "berlin", direction = -1) +
  labs(
    title = "Variable availability and missing values % by survey",
    x = "Variable",
    y = "Survey",
    fill = "% Missing"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    # Vertical x-axis labels so the variable names do not overlap
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
    legend.position = "right",
    panel.grid.minor = element_blank(),
    strip.text = element_text(size = 12, face = "bold")
  )
variables_availability_plot

# Write the figure to the figures folder: width 10, height 8 in ggsave()'s
# default units, on a white background.
ggsave(
  filename = file.path("outputs/figures", "variables_availability.png"),
  plot = variables_availability_plot,
  width = 10,
  height = 8,
  bg = "white"
)

